#####################################################################################################################
# Script: Naturalization and the transition to homeownership; an analysis of signalling in the Dutch housing market #                                                                                                                  #
# Floris Peters                                                                                                     #                                                                                                                  #
# Housing Studies                                                                                                   #
#####################################################################################################################


################
# Introduction #
################

# This file provides the syntax used to create all the tables and figures in the paper. 
# The dataset contains sensitive, micro level information. As such, for privacy reasons the data is only available to individuals employed at or affiliated to Statistics Netherlands. 
# The dataset can be found at the following location on the network of Statistics Netherlands: \\cbsp.nl\Productie\Projecten\SAL\209253UM_FP_SEC1\Werk\Floris\PhD\HS_housing
# The original data files from which the above dataset was constructed can be found at the following location on the network of Statistics Netherlands: \\Ssb2f\ssb24\SatSocSam\MACIMIDE\DATA  

#######################################################################################################################
#######################################################################################################################

#############
# Variables #
#############

# ID
# (Individual identification number)

# START
# (Time vector - start of a given time period)

# STOP
# (Time vector - end of a given time period)

# EVENT
# (Homeownership)
# [0] The event does not occur during this time period; 
# [1] The event occurs at the end of this time period

# HOMEOWNERSHIP
# (is an individual a homeowner?)
# [0] no; 
# [1] yes

# YSM
# (years since migration)

# NATURALISED
# (Naturalised - time-varying)
# [0] Not naturalised; 
# [1] Naturalised

# NATURALISATION
# (Naturalisation during the observation period)
# [0] No naturalisation during the observation period; 
# [1] Naturalisation during the observation period

# IMMIGRATIONYEAR
# (Year of first immigration to the Netherlands)

# GENDER
# [0] Female; 
# [1] Male

# AGEARRIVAL
# (Age at the moment of migration in years)

# AGEARRIVAL_SQR
# (Age at the moment of migration in years, squared)

# AGEARRIVAL_CAT 
# (Age at the moment of migration in categories) 
# [1] 20-24 years; 
# [2] 25-29 years; 
# [3] 30-34 years; 
# [4] 35-39 years; 
# [5] 40-44 years; 
# [6] 45-50 years

# PARTNER
# [1] No partner; 
# [2] Foreign-born foreign partner; 
# [3] Foreign born naturalised partner; 
# [4] native partner

# NATIVE_PARTNER
# [0] No native partner; 
# [1] Native partner

# PARTNER_ID
# (Partner identification number)

# CHILDREC
# (Children in the household in categories)
# [1] Children <18 in household; 
# [2] no children <18 in household

# INCOME_HH
# (CPI adjusted log disposable household income)

# EMPLOYMENT
# (Employment status)
# [0] Not employed; 
# [1] Employed

# EMPLOYMENT_DUR
# (Duration of employment)

# DEVELOPMENT
# (Human Development Index (HDI) score origin country)

# DEVCAT
# (Human Development index (HDI) score origin country in categories)
# [1] First quartile; 
# [2] Second quartile; 
# [3] Third quartile; 
# [4] Fourth quartile

# CULT_DIST
# (level of cultural distance between the origin country and the Netherlands, based on the Hofstede index)
    
# EU
# [0] Not EU country of origin; 
# [1] EU country of origin

# URBANISATION
# (level of urbanisation municipality of residence)
# [1] Very high
# [2] high
# [3] moderate
# [4] low
# [5] very low
# [9] unknown

# POST2008
# [0] observation year <= 2008; 
# [1] observation year > 2008

# EDUCATION
# [0] missing; 
# [1] Low education; 
# [2] Middle education; 
# [3] High education

# TRUNCATION
# (does observation period end prematurely)
# [0] no; 
# [1] yes

#######################################################################################################################
#######################################################################################################################

#Upload the dataset (Data_cit_housing_main.sav) to R and load the necessary libraries#
library(Matrix)
library(optimx)
library(splines)
library(survival)
library(foreign)
library(dplyr)
library(plm)
# Read the analysis dataset from an interactively chosen file; the file is
# parsed as semicolon-separated text with a header row.
# NOTE(review): the comment above names a .sav file, but read.csv() expects
# delimited text -- confirm the export format of the file being selected.
dataset_main <- read.csv(file.choose(), header = TRUE, sep = ";")


###########     
# Table 1 #
###########

# Table 1: Cox proportional hazards models for the transition to
# homeownership, fitted separately for employed and non-employed immigrants.
# Surv() and coxph() are provided by the survival package.
# NOTE(review): the period dummy is written POST2008 (as in Tables 2-4)
# instead of the original post2008, because R column names are case-sensitive.
# Confirm the spelling against the header of the data file.

#select immigrants with employment
dataset_empl <- subset(dataset_main, EMPLOYMENT == 1)

#build the start/stop survival outcome (EVENT flags the transition)
dataset_empl$surv_table_1_empl <- Surv(dataset_empl$START, dataset_empl$STOP, dataset_empl$EVENT)

#table 1 regression: employed
table_1_empl <- coxph(surv_table_1_empl ~ NATURALISED + NATURALISATION + as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
                        as.factor(PARTNER) + CHILDREC + INCOME_HH + EMPLOYMENT_DUR + as.factor(URBANISATION) + DEVELOPMENTREC +
                        EU + POST2008, data = dataset_empl)

#select immigrants without employment
dataset_noempl <- subset(dataset_main, EMPLOYMENT == 0)

#build the start/stop survival outcome (EVENT flags the transition)
dataset_noempl$surv_table_1_noempl <- Surv(dataset_noempl$START, dataset_noempl$STOP, dataset_noempl$EVENT)

#table 1 regression: not employed (EMPLOYMENT_DUR is not included in this model)
table_1_noempl <- coxph(surv_table_1_noempl ~ NATURALISED + NATURALISATION + as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
                        as.factor(PARTNER) + CHILDREC + INCOME_HH + as.factor(URBANISATION) + DEVELOPMENTREC +
                        EU + POST2008, data = dataset_noempl)


###########     
# Table 2 #
###########

# Table 2 sample: rows of employed immigrants only. which() discards rows
# whose EMPLOYMENT value is missing, exactly as subset() would.
dataset_empl <- dataset_main[which(dataset_main$EMPLOYMENT == 1), , drop = FALSE]

# Start/stop survival outcome, kept as a column of the analysis sample.
dataset_empl$surv_table_2_empl <- with(dataset_empl, Surv(START, STOP, EVENT))

# Table 2 regression: Cox model in which NATURALISATION enters only through
# its interaction with NATIVE_PARTNER.
table_2 <- coxph(
  surv_table_2_empl ~ NATURALISED + NATURALISATION:NATIVE_PARTNER +
    as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
    as.factor(PARTNER) + CHILDREC + INCOME_HH + EMPLOYMENT_DUR +
    as.factor(URBANISATION) + DEVELOPMENTREC + EU + POST2008,
  data = dataset_empl
)


###########     
# Table 3 #
###########

# Table 3 sample: employed immigrants with a non-negative cultural distance
# score. Rows where either condition evaluates to NA are dropped.
keep_empl_cult <- dataset_main$EMPLOYMENT == 1 & dataset_main$CULT_DIST >= 0
dataset_empl_cult <- dataset_main[which(keep_empl_cult), , drop = FALSE]

# Start/stop survival outcome, kept as a column of the analysis sample.
dataset_empl_cult$surv_table_3_empl <- with(dataset_empl_cult, Surv(START, STOP, EVENT))

# Table 3 regression: Cox model in which NATURALISATION enters only through
# its interaction with CULT_DIST.
table_3 <- coxph(
  surv_table_3_empl ~ NATURALISED + NATURALISATION:CULT_DIST +
    as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
    as.factor(PARTNER) + CHILDREC + INCOME_HH + EMPLOYMENT_DUR +
    as.factor(URBANISATION) + DEVELOPMENTREC + EU + POST2008,
  data = dataset_empl_cult
)


###########     
# Table 4 #
###########

# Table 4 sample: rows of employed immigrants only. which() discards rows
# whose EMPLOYMENT value is missing, exactly as subset() would.
dataset_empl <- dataset_main[which(dataset_main$EMPLOYMENT == 1), , drop = FALSE]

# Start/stop survival outcome, kept as a column of the analysis sample.
dataset_empl$surv_table_4_empl <- with(dataset_empl, Surv(START, STOP, EVENT))

# Table 4 regression: Cox model in which NATURALISATION enters only through
# its interaction with POST2008; POST2008 has no separate main effect here.
table_4 <- coxph(
  surv_table_4_empl ~ NATURALISED + NATURALISATION:POST2008 +
    as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
    as.factor(PARTNER) + CHILDREC + INCOME_HH + EMPLOYMENT_DUR +
    as.factor(URBANISATION) + DEVELOPMENTREC + EU,
  data = dataset_empl
)


############     
# Table A1 #
############

# Table A1: share of homeownership events at each individual's last
# observation, broken down by subgroup.
#
# Fixes relative to the original:
#  * summarise(lastocc = last(YEAR)) kept only ID and lastocc, so none of the
#    covariates used below existed in the result; the full last row per
#    individual is kept instead.
#  * prop.table() needs a contingency table as input; its second argument is
#    the margin, not a second variable.
#  * NATUALISED was a typo for NATURALISED, and CHILD is spelled CHILDREC as in
#    the regression models.

#select last observation (latest YEAR) per individual, keeping all columns
dataset_main_last_obs <- dataset_main %>%
  arrange(ID, YEAR) %>%
  group_by(ID) %>%
  slice(n()) %>%
  ungroup()

#proportion homeownership (last observation per individual) by subgroups;
#margin = 1 gives proportions within each subgroup (rows sum to 1)
with(dataset_main_last_obs, prop.table(table(NATURALISED, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(NATURALISATION, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(GENDER, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(AGEARRIVAL_CAT, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(PARTNER, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(CHILDREC, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(EMPLOYMENT, EVENT), margin = 1))
# NOTE(review): INCOME_HH is documented as a continuous (log income) variable,
# so this tabulation has one row per distinct value -- confirm whether a
# summary statistic by EVENT was intended instead.
with(dataset_main_last_obs, prop.table(table(INCOME_HH, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(URBANISATION, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(DEVCAT, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(EU, EVENT), margin = 1))
with(dataset_main_last_obs, prop.table(table(POST2008, EVENT), margin = 1))

#overall proportion
prop.table(table(dataset_main_last_obs$EVENT))


############     
# Table A2 #
############

# Table A2: the three interaction models of Tables 2-4, re-estimated for
# immigrants without employment (EMPLOYMENT_DUR is not included).

#select immigrants without employment
dataset_noempl <- subset(dataset_main, EMPLOYMENT == 0)

#build the start/stop survival outcome (EVENT flags the transition)
dataset_noempl$surv_table_A2_noempl <- Surv(dataset_noempl$START, dataset_noempl$STOP, dataset_noempl$EVENT)

#table A2 regression: interaction native partner
table_A2_model1 <- coxph(surv_table_A2_noempl ~ NATURALISED + NATURALISATION:NATIVE_PARTNER + as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
                   as.factor(PARTNER) + CHILDREC + INCOME_HH + as.factor(URBANISATION) + DEVELOPMENTREC +
                   EU + POST2008, data = dataset_noempl)

#table A2 regression: interaction cultural distance
#The sample is restricted to CULT_DIST >= 0, the same restriction Table 3
#applies for this interaction. Rows with NA in CULT_DIST would be dropped by
#coxph() anyway, so this only matters if missing values carry a negative code.
# NOTE(review): confirm how missing cultural distance is coded in the data.
dataset_noempl_cult <- subset(dataset_noempl, CULT_DIST >= 0)
table_A2_model2 <- coxph(surv_table_A2_noempl ~ NATURALISED + NATURALISATION:CULT_DIST + as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
                           as.factor(PARTNER) + CHILDREC + INCOME_HH + as.factor(URBANISATION) + DEVELOPMENTREC +
                           EU + POST2008, data = dataset_noempl_cult)

#table A2 regression: interaction post 2008
table_A2_model3 <- coxph(surv_table_A2_noempl ~ NATURALISED + NATURALISATION:POST2008 + as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
                           as.factor(PARTNER) + CHILDREC + INCOME_HH + as.factor(URBANISATION) + DEVELOPMENTREC +
                           EU, data = dataset_noempl)


############     
# Table A3 #
############

# Table A3: individual fixed-effects ("within") panel model of homeownership
# with a lagged dependent variable, for employed immigrants.
#
# Fixes relative to the original:
#  * The lag was taken over the whole stacked data frame, so the first row of
#    each individual received the previous individual's value. It is now
#    computed within ID after sorting by YEAR.
#  * lag() is qualified as dplyr::lag because plm is attached after dplyr and
#    an unqualified lag() may resolve to a different function.
#  * post2008 is written POST2008 to match Tables 2-4 (R is case-sensitive).
#    NOTE(review): confirm the spelling against the data file header.

#select immigrants with employment
dataset_empl <- subset(dataset_main, EMPLOYMENT == 1)

#lag dependent variable within each individual; the first observation of
#every individual gets NA. The lag refers to the previous row in the employed
#sample, which is the previous calendar year only if there are no gaps.
dataset_empl <- dataset_empl %>%
  arrange(ID, YEAR) %>%
  group_by(ID) %>%
  mutate(HOMEOWNERSHIPLAG = dplyr::lag(HOMEOWNERSHIP, n = 1L)) %>%
  ungroup() %>%
  as.data.frame()

#table A3 regression: plm
table_A3_empl <- plm(HOMEOWNERSHIP ~ NATURALISED + YSM + HOMEOWNERSHIPLAG + as.factor(PARTNER) +
                       CHILDREC + INCOME_HH + as.factor(URBANISATION) + POST2008,
                     data = dataset_empl,
                     model = "within",
                     effect = "individual",
                     index = c("ID", "YEAR"))


############     
# Table A4 #
############

# Table A4: Table 1 model for employed immigrants with an added control for a
# change of partner between consecutive observations.
#
# Fixes relative to the original:
#  * The partner lag was taken over the whole stacked data frame, so the first
#    row of each individual was compared with the previous individual's
#    partner. It is now computed within ID after sorting by YEAR.
#  * ifelse(a != b, 1, 0) returned NA whenever either partner ID was missing,
#    which contradicts "0 otherwise" and silently drops those rows from the
#    model. The comparison is now NA-aware.
#  * post2008 is written POST2008 to match Tables 2-4 (R is case-sensitive).
#    NOTE(review): confirm the spelling against the data file header.

#select immigrants with employment
dataset_empl <- subset(dataset_main, EMPLOYMENT == 1)

#lag partner variable within each individual and flag partner changes:
#PARTNER_SHIFT is 1 if the partner ID differs from the previous observation
#(including gaining or losing a partner, i.e. NA <-> non-NA), and 0 otherwise;
#the first observation of an individual has no predecessor and is set to 0
# NOTE(review): assumes "no partner" is coded as NA in PARTNER_ID -- confirm.
dataset_empl <- dataset_empl %>%
  arrange(ID, YEAR) %>%
  group_by(ID) %>%
  mutate(
    PARTNER_ID_LAG = dplyr::lag(PARTNER_ID, n = 1L),
    PARTNER_SHIFT = as.numeric(
      row_number() > 1L &
        (is.na(PARTNER_ID) != is.na(PARTNER_ID_LAG) |
           (!is.na(PARTNER_ID) & !is.na(PARTNER_ID_LAG) &
              PARTNER_ID != PARTNER_ID_LAG))
    )
  ) %>%
  ungroup() %>%
  as.data.frame()

#build the start/stop survival outcome (EVENT flags the transition)
dataset_empl$surv_table_A4_empl <- Surv(dataset_empl$START, dataset_empl$STOP, dataset_empl$EVENT)

#table A4 regression
table_A4_empl <- coxph(surv_table_A4_empl ~ NATURALISED + NATURALISATION + as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
                        as.factor(PARTNER) + PARTNER_SHIFT + CHILDREC + INCOME_HH + EMPLOYMENT_DUR + as.factor(URBANISATION) +
                        DEVELOPMENTREC + EU + POST2008, data = dataset_empl)


############     
# Table A5 #
############

#select immigrants with employment and information on education
# Table A5: Table 1 model for employed immigrants, restricted to rows with
# known education (EDUCATION >= 1 excludes the "missing" code 0) and with
# education added as a categorical control.
# NOTE(review): post2008 is written POST2008 to match Tables 2-4 (R is
# case-sensitive) -- confirm the spelling against the data file header.
dataset_empl_edu <- subset(dataset_main, EMPLOYMENT == 1 & EDUCATION >= 1)

#build the start/stop survival outcome (EVENT flags the transition)
dataset_empl_edu$surv_table_A5_empl_edu <- Surv(dataset_empl_edu$START, dataset_empl_edu$STOP, dataset_empl_edu$EVENT)

#table A5 regression
table_A5_empl_edu <- coxph(surv_table_A5_empl_edu ~ NATURALISED + NATURALISATION + as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
                         as.factor(PARTNER) + CHILDREC + INCOME_HH + EMPLOYMENT_DUR + as.factor(URBANISATION) +
                         DEVELOPMENTREC + EU + POST2008 + as.factor(EDUCATION), data = dataset_empl_edu)


############     
# Table A6 #
############

# Table A6: Table 1 model for employed immigrants, restricted to rows where
# TRUNCATION == 0 (observation period does not end prematurely).
# NOTE(review): post2008 is written POST2008 to match Tables 2-4 (R is
# case-sensitive) -- confirm the spelling against the data file header.

#select immigrants with employment and without a truncated observation period
dataset_empl_trunc <- subset(dataset_main, EMPLOYMENT == 1 & TRUNCATION == 0)

#build the start/stop survival outcome (EVENT flags the transition)
dataset_empl_trunc$surv_table_A6_empl_trunc <- Surv(dataset_empl_trunc$START, dataset_empl_trunc$STOP, dataset_empl_trunc$EVENT)

#table A6 regression
table_A6_empl_trunc <- coxph(surv_table_A6_empl_trunc ~ NATURALISED + NATURALISATION + as.factor(GENDER) + AGEARRIVAL + AGEARRIVAL_SQR +
                             as.factor(PARTNER) + CHILDREC + INCOME_HH + EMPLOYMENT_DUR + as.factor(URBANISATION) +
                             DEVELOPMENTREC + EU + POST2008, data = dataset_empl_trunc)


